/******************************************************************************
Author: Akshay Dixit
Date created: April 18, 2018
Date last modified: June 11, 2020
File name: 1_in.do
Project: T4D Indonesia 
Purpose: Baseline data on primary & secondary outcomes
******************************************************************************/

/*

	This .do file prepares a village-level dataset of average outcomes at baseline, for
	primary and secondary outcomes (see Pre-Analysis Plan for how these outcomes are defined).

	The purpose of preparing these average outcomes is to include Baseline village-level averages in 
	the final regression analyses, as a robustness check. 

	This average baseline outcome was calculated for the following outcomes:
	
		Primary
			- Delivery with a skilled birth attendant
			- Delivery at a health facility
			- Postnatal care (mother & baby)
			- Weight-for-age
			- Height-for-age
			- Content of Care index (Note: No measure for delivery content of care + Outcome components different at baseline/endline)
			- Empowerment: Participation index (Note: Outcome components different at baseline/endline)
			- Empowerment: Based on a self-efficacy question
		
		Secondary
			- First ANC visit within the first trimester (with a skilled provider)
			- Four or more ANC visits (with a skilled provider)
			- Birth weight
			- Antenatal Content of Care (Note: Outcome components different at baseline/endline)
			- Birth preparedness
		
	Data on the following outcome was not collected at baseline:
		Secondary
			- Maternal depression
	
	In addition, the .do file also computes a measure of baseline health facility quality, to be used
	in the subgroup analysis.
	This includes:
			1. Facility has specific room for deliveries
			2. Delivery room has some form of auditory and/or visual privacy
			3. Delivery room bed is clean
			4. Delivery room floor is clean
			5. Delivery room is well-ventilated or air-conditioned
			6. Biological/medical waste disposal available
			7. No dust/mold observed in delivery room
			8. Separate bin for placenta disposal
			9. Water is available to flush in toilet
			10. Water is available to wash hands in toilet
			11. Soap is available in toilet
			12. MNH staff present 24*7			
			
*/


clear all
set more off

// Work from the project data folder; the global macro named data must be defined before this file runs
cd "$data"

use "Household_Merged_BL_IN.dta", clear

********************************************************************************
* HOUSEHOLD SURVEY
********************************************************************************

* Creating village IDs
//iddesa is the concatenation of the four location codes, each left-padded with
//zeros to a fixed width: lk01_cd and lk02_cd to 2 characters, lk03_cd and lk04_cd to 3.
//The string lengths measured before padding are kept in length01-length04.
local code_widths 2 2 3 3
forvalues k = 1/4 {
	local width : word `k' of `code_widths'
	tostring lk0`k'_cd, replace
	g length0`k' = strlen(lk0`k'_cd)
	//Pad only non-empty codes that are shorter than the target width.
	//Every shortfall is handled the same way for all four codes, including one-character lk03_cd values.
	replace lk0`k'_cd = substr("00", 1, `width' - length0`k') + lk0`k'_cd if inrange(length0`k', 1, `width' - 1)
}

egen iddesa = concat(lk01_cd lk02_cd lk03_cd lk04_cd)

* Merge with treatment/strata
//Many household rows are matched to one row per iddesa in the randomization file.
//NOTE(review): the merge result (_merge) is not inspected here, so unmatched rows from either side are kept.

merge m:1 iddesa using "randomization_strata.dta"

//One 0/1 indicator per stratum value 1-4; any other value of strata (including missing) gives 0 in all four
forvalues s = 1/4 {
	g strata`s' = (strata == `s')
}

********************************************************************************

* PRIMARY OUTCOMES

* Delivery with a skilled birth attendant
//pf30 is a string response. The outcome is 1 when its first character is A, B or C,
//0 for any other non-empty response, and missing when pf30 is empty.
tab pf30
g bl_skilled_provider_birth = inlist(substr(pf30,1,1), "A", "B", "C") if pf30 != ""


* Birth at a facility 
//1 when pf32a_x equals 1, 0 for any other value, missing when pf32a_x is system-missing
tab pf32a_x
g bl_facility_birth = (pf32a_x == 1) if pf32a_x != .


* Postnatal care (mother & baby)

	//Skilled provider check received within 7 days
	//One 0/1 indicator per check slot: postpartum slots A-J (pf51-pf53) and postnatal slots A-G (pf55-pf57).
	//An indicator is 1 when both follow-up items of the slot equal 1, and is
	//missing when the slot's lead item (pf51 or pf55) is system-missing.
local pp_slots
foreach s in A B C D E F G H I J {
	g postpartum_check_`s' = (pf53`s' == 1 & pf52`s' == 1) if pf51`s' != .
	tab postpartum_check_`s'
	local pp_slots `pp_slots' postpartum_check_`s'
}

local pn_slots
foreach s in A B C D E F G {
	g postnatal_check_`s' = (pf56`s' == 1 & pf57`s' == 1) if pf55`s' != .
	tab postnatal_check_`s'
	local pn_slots `pn_slots' postnatal_check_`s'
}

	//Whether at least one postpartum/postnatal check was received
	//The row maximum of 0/1 indicators is 1 when any slot is 1, 0 when no slot is 1 but
	//at least one slot is observed, and missing when every slot is missing.
egen postpartum_check = rowmax(`pp_slots')
egen postnatal_check = rowmax(`pn_slots')

	//Combining the binary postpartum and postnatal care variables
	//1 only when both checks were received; missing when either component is missing
g bl_postpartum_postnatal_care = (postpartum_check == 1 & postnatal_check == 1) if postpartum_check != . & postnatal_check != .


* Weight-for-age
desc pf28_x* pf28_a* pf28_b* pf28_c*
count if pf28_a1 == .	//41 missing measurements
count if pf28_x2 != .	//16 twins
count if pf28_x3 != .	//No triplets
count if pf28_c1 != .	//132 cases with a third measurement
	
	/*
	Birth order within a pair of twins cannot be recovered from the data.
	Only village-level averages are needed, and 16 twin pairs will not move
	those averages much in either direction.
	
	The calculations below therefore use only the first twin entered,
	i.e. variables pf28_a1, pf28_b1 and pf28_c1.
	*/
	
	//Age of the last born child
	//The day of birth is set to the 15th of the month for every recorded birth year
	forvalues k = 1/3 {
		g sk14_day`k' = 15 if sk14_yr`k' != .
	}
	
	//Checks on the ordering of the recorded birth years
	count if sk14_yr3 < sk14_yr2 & sk14_yr3 != . & sk14_yr2 != .
	count if sk14_yr2 < sk14_yr1 & sk14_yr2 != . & sk14_yr1 != .
	g flag_yr = (sk14_yr2 < sk14_yr1 & sk14_yr2 != . & sk14_yr1 != .)
	count if sk14_yr3 < sk14_yr1 & sk14_yr3 != . & sk14_yr1 != .

	 /*
	 Reading yr3, then yr2, then yr1 moves from the youngest to the oldest child:
	 sk14_yr3 > sk14_yr2 > sk14_yr1, in general.
	 Three cases break this rule, with sk14_yr2 < sk14_yr1.
	 
	 The youngest child's year of birth is therefore yr3 when present,
	 otherwise yr2, otherwise yr1.
	 
	 The three exceptions are marked by "flag_yr" and use slot 1 instead.
	 
	 */
	
		*Date of birth variable
	forvalues k = 1/3 {
		g date_birth`k' = mdy(sk14_mth`k', sk14_day`k', sk14_yr`k')
		format date_birth`k' %d
	}
	//Flagged cases take slot 1; all others take the last non-missing of slots 3, 2, 1
	g date_birth = cond(flag_yr == 1, date_birth1, ///
		cond(date_birth3 != ., date_birth3, ///
		cond(date_birth2 != ., date_birth2, date_birth1)))
	format date_birth %d
	
		*Survey date variable
	g date_survey = mdy(mth_1, day_1, yr_1)
	format date_survey %d
	
		*Age of the infant
	//Days between survey and birth, converted to whole months using 30 days per month
	//NOTE(review): 30 days is an approximation of a month (mean is about 30.44) - confirm this is acceptable
	g age_in_months = round((date_survey - date_birth)/30) 
	
	
	//Sex of the last born child
	//Same slot-selection rule as for the date of birth
	desc sk12*
	g sex_of_infant = cond(flag_yr == 1, sk121, ///
		cond(sk123 != ., sk123, ///
		cond(sk122 != ., sk122, sk121)))
	drop flag_yr
	
	//Combine the different readings into a single measure
	//Up to three weight readings exist (a, b, c). The final weight is the mean of the pair
	//of readings that are closest to each other. Pairs that involve reading c are only
	//eligible when reading c is present. On ties, a-c takes precedence over b-c, which
	//takes precedence over a-b.
	foreach pair in ab bc ac {
		local r1 = substr("`pair'", 1, 1)
		local r2 = substr("`pair'", 2, 1)
		g weight_diff_`pair' = abs(pf28_`r1'1 - pf28_`r2'1)
	}
	egen least_weight_diff = rowmin(weight_diff_ab weight_diff_bc weight_diff_ac) 
	foreach pair in ab bc ac {
		local r1 = substr("`pair'", 1, 1)
		local r2 = substr("`pair'", 2, 1)
		egen weight_`pair' = rowmean(pf28_`r1'1 pf28_`r2'1)
	}
	g weight = cond(least_weight_diff == weight_diff_ac & pf28_c1 != ., weight_ac, ///
		cond(least_weight_diff == weight_diff_bc & pf28_c1 != ., weight_bc, ///
		cond(least_weight_diff == weight_diff_ab, weight_ab, .)))
	drop *weight_diff* weight_*
	
	//Compute z-score
	//zanthro is a user-written egen function; sex codes are 1 for male and 3 for female, age is in months
	egen bl_waz = zanthro(weight, wa, WHO), xvar(age_in_months) gender(sex_of_infant) gencode(male=1, female=3) ageunit(month)
		/*
		51 missing values generated
		41 are cases where the weight is missing.
		The remaining 10 are cases where the Z-score exceeds 5, and zanthro omits them
		*/
		
	//Binary outcome for underweight
	//1 when the weight-for-age z-score is below -2; missing when the z-score is missing
	g bl_underweight = (bl_waz < -2) if bl_waz != .
	
		
* Height-for-age
desc pf29_a* pf29_b* pf29_c*
count if pf29_a1 == .	//44 missing measurements
count if pf29_x2 != .	//16 twins
count if pf29_x3 != .	//No triplets
count if pf29_c1 != .	//6 cases with a third measurement
	
	//Combine the different readings into a single measure
	//Height is the mean of readings a and b; the third reading (pf29_c1) is not used
	egen height = rowmean(pf29_a1 pf29_b1)
	
	//Compute z-score
	//Uses the same age and sex variables as the weight-for-age z-score
	egen bl_haz = zanthro(height, ha, WHO), xvar(age_in_months) gender(sex_of_infant) gencode(male=1, female=3) ageunit(month)
		/*
		59 missing values generated
		44 are cases where the height is missing.
		The remaining 15 are cases where the Z-score exceeds 5, and zanthro omits them
		*/
	
	//Binary variable for stunting
	//1 when the height-for-age z-score is below -2; missing when the z-score is missing
	g bl_stunted = (bl_haz < -2) if bl_haz != .
	
	
* Content of care

***Impute missing values with the treatment assignment group mean***

	//Lists of per-slot indicators: postpartum slots A-J and postnatal slots A-G
local ppcc
foreach s in A B C D E F G H I J {
	local ppcc `ppcc' postpartum_check_`s'
}
local pncc
foreach s in A B C D E F G {
	local pncc `pncc' postnatal_check_`s'
}

	*Postpartum content of care
	foreach var of local ppcc {
		tab `var'
		tempvar grp_mean
		bysort treatment: egen `grp_mean' = mean(`var')
		replace `var' = `grp_mean' if `var' == .
		drop `grp_mean'
	}

	*Postnatal content of care
	foreach var of local pncc {
		tempvar grp_mean
		bysort treatment: egen `grp_mean' = mean(`var')
		replace `var' = `grp_mean' if `var' == .
		*replace `var' = . if sk03 == 1	
			//The above line, if executed, would exclude children that were born alive but died later.
		drop `grp_mean'
	}
	
***Individual content of care outcomes***
	//Row sums of the imputed indicators; a sum is missing only when every component is missing
egen postpartum_content = rowtotal(`ppcc'), missing
egen postnatal_content = rowtotal(`pncc'), missing
*replace postnatal_content = . if sk03 == 1
		//The above line, if executed, would exclude children that were born alive but died later.

tab postpartum_content
tab postnatal_content

***Content of care index***	

	*1. All variables are already oriented so that higher values represent "better" outcomes
	
	*2. Standardize each outcome - using the mean and s.d. of the control group
	//Control group is treatment == 0; the moments are stored in scalars mean_<outcome> and sd_<outcome>
	foreach var in postpartum_content postnatal_content {
		summarize `var' if treatment == 0
		scalar mean_`var' = r(mean)
		scalar sd_`var' = r(sd)
		g z_`var' = (`var' - scalar(mean_`var'))/scalar(sd_`var')
	}
	
	*3. Missing value imputation already done
	
	
	*4. Compile summary index
	//The index is the row mean of the two standardized content scores
	egen bl_z_content = rowmean(z_postpartum_content z_postnatal_content)
	summarize bl_z_content
	
********************************************************************************

* Empowerment - participation
//Items ks23_a-ks23_f and ks18: codes 7 and 8 are set to missing, then each item becomes
//a 0/1 dummy equal to 1 when the item equals 1 (missing when the item is missing).
local participation ks23_a ks23_b ks23_c ks23_d ks23_e ks23_f ks18
local index_participation
local z_participation
foreach item of local participation {
	tab `item'
	replace `item' = . if inlist(`item', 7, 8)
	g `item'_dummy = (`item' == 1) if `item' != .
	local index_participation `index_participation' `item'_dummy
	local z_participation `z_participation' z_`item'_dummy
}
	
	*1. All variables are already oriented so that the higher value (1) represents the "better" outcome
	
	*3. Impute missing values at treatment assignment group mean
	//Imputation is carried out before standardization, so step 3 precedes step 2 here
	foreach var of local index_participation {
		bysort treatment: egen mean_`var' = mean(`var')
		replace `var' = mean_`var' if `var' == .
	}
	//The helper variables must be gone before scalars with the same names are created below
	drop mean_* 
	
	*2. Standardize each outcome - using the mean and s.d. of the control group
	foreach var of local index_participation {
		summarize `var' if treatment == 0
		scalar mean_`var' = r(mean)
		scalar sd_`var' = r(sd)
		g z_`var' = (`var' - scalar(mean_`var'))/scalar(sd_`var')
	}
	
	*4. Compile summary index
	//The index is the row mean of the seven standardized dummies
	egen bl_participation = rowmean(`z_participation') 
	summarize bl_participation

********************************************************************************

* Empowerment - self-efficacy
//ks17 is carried forward under the name bl_ks11, with code 7 set to missing

replace ks17 = . if ks17 == 7
rename ks17 bl_ks11
tab bl_ks11

********************************************************************************

* SECONDARY OUTCOMES

* First ANC visit within the first trimester
//1 when the recorded timing of the first visit is 13 or less; missing when the timing is missing
rename pf04 bl_timing_anc_visit
g bl_anc_first_trimester = (bl_timing_anc_visit <= 13) if bl_timing_anc_visit != .

	//Instances where this visit was with an unskilled provider should be marked 0
	//The provider is skilled when the first character of pf091 is A, B, C or D; missing when pf091 is empty
g first_anc_skilled_provider = inlist(substr(pf091,1,1), "A", "B", "C", "D") if pf091 != ""
replace bl_anc_first_trimester = 0 if first_anc_skilled_provider == 0
	
	
* Four or more ANC visits (with a skilled provider)
//For each provider slot pf091-pf096, flag whether the provider code starts with A, B, C or D.
//The flag is missing when the provider string is empty.
local anc_skilled pf091 pf092 pf093 pf094 pf095 pf096
foreach var of local anc_skilled {
	g provider_`var' = substr(`var',1,1)
	//Reference the slot-specific helper by its full name, so the test does not depend on variable-name abbreviation
	g anc_skilled_`var' = inlist(provider_`var', "A", "B", "C", "D")
	replace anc_skilled_`var' = . if provider_`var' == ""
	drop provider_`var'
}

//Visits with a skilled provider in each slot: the skilled flag times the slot's visit count pf10
forvalues i = 1/6 {
	g number_anc_visits`i' = anc_skilled_pf09`i' * pf10`i'
}

//Total skilled visits across slots; missing only when every slot is missing
egen bl_number_anc_visits = rowtotal(number_anc_visits1 number_anc_visits2 number_anc_visits3 number_anc_visits4 number_anc_visits5 number_anc_visits6), missing 

//Binary outcome: 1 when the total is 4 or more, missing when the total is missing
g bl_number_anc_visits_binary = (bl_number_anc_visits >= 4)
replace bl_number_anc_visits_binary = . if bl_number_anc_visits == .
tab bl_number_anc_visits_binary


* Birth weight
//1 when pf251 is below 2.5, missing when pf251 is missing
g bl_low_birthweight = (pf251 < 2.5) if pf251 != .


* Birth preparedness			
//Code 3 is recoded to 0 in each item, then the six items are summed
//(the sum is missing only when all six items are missing)
foreach item in sp10_a sp10_b sp10_c sp10_d sp10_e sp10_f {
	replace `item' = 0 if `item' == 3
	tab `item'
}
egen bl_birth_preparedness = rowtotal(sp10_a sp10_b sp10_c sp10_d sp10_e sp10_f), missing

* ANC content of care
desc pf13A pf13B pf13D pf17

//"Don't know" (8) is treated as "No" (3), and "No" is recoded to 0
foreach item in pf13A pf13B pf13D pf17 {
	replace `item' = 0 if inlist(`item', 3, 8)
	tab `item'
}

//Sum of the four items; missing only when all four are missing
egen bl_anc_content = rowtotal(pf13A pf13B pf13D pf17), missing
tab bl_anc_content

********************************************************************************

* CREATE DATA-SET WITH VILLAGE-LEVEL AVERAGES


*Calculating village-level averages
//Each outcome gets a companion variable m_<outcome> that holds its mean within iddesa
local outcomes bl_skilled_provider_birth bl_facility_birth bl_postpartum_postnatal_care ///
bl_participation bl_ks11 bl_z_content bl_stunted bl_underweight ///
bl_anc_first_trimester bl_number_anc_visits_binary bl_anc_content bl_low_birthweight bl_birth_preparedness

local village_means
foreach var of local outcomes {
	bysort iddesa: egen m_`var' = mean(`var')
	local village_means `village_means' m_`var'
}

*Collapsing the data-set by village ID
//The means are constant within iddesa, so contract leaves one row per village; its frequency count is not needed
contract iddesa `village_means'

drop _freq

label variable m_bl_skilled_provider_birth "Baseline village-level average: Birth with a skilled provider" 
label variable m_bl_facility_birth "Baseline village-level average: Birth at a facility"
label variable m_bl_postpartum_postnatal_care "Baseline village-level average: Postpartum & postnatal care"
label variable m_bl_z_content "Baseline village-level average: Content of care (postpartum & postnatal)"
label variable m_bl_stunted "Baseline village-level average: Stunted"
label variable m_bl_underweight "Baseline village-level average: Underweight"
label variable m_bl_participation "Baseline Empowerment - Participation"
label variable m_bl_ks11 "Baseline Empowerment - Self Efficacy"

label variable m_bl_anc_first_trimester "Baseline village-level average: ANC within the first trimester"
label variable m_bl_number_anc_visits_binary "Baseline village-level average: 4 or more ANC visits"
label variable m_bl_low_birthweight "Baseline village-level average: Low birthweight"
label variable m_bl_birth_preparedness "Baseline village-level average: Birth preparedness"
label variable m_bl_anc_content "Baseline village-level average: Content of Antenatal Care"

save "baseline_outcomes_by_village.dta", replace

********************************************************************************

* BASELINE FACILITY QUALITY FOR SUBGROUP ANALYSIS

use "Facility_BL_IN.dta", clear

//Twelve 0/1 quality indicators. Each one is 1 when the source item takes the listed value(s),
//0 for any other observed value, and missing when the source item is missing.

//Numeric items
g room_for_deliveries = (kp01 == 1) if kp01 != .

g delivery_room_privacy = inlist(kp03, 1, 2, 3) if kp03 != .

//String items: the indicator is 1 when the response is "A"
g delivery_room_bed_clean = (kp06_a == "A") if kp06_a != ""

g delivery_room_floor_clean = (kp06_b == "A") if kp06_b != ""

//Numeric items
g delivery_room_ventilated = inlist(kp06_c, 1, 6) if kp06_c != .

g delivery_room_waste = (kp06_d == 1) if kp06_d != .

//This indicator is 1 when kp06_e equals 3
g delivery_room_dust = (kp06_e == 3) if kp06_e != .

g placenta_bin = (kp07 == 1) if kp07 != .

g toilet_water_flush = (kp09_a == 1) if kp09_a != .

g toilet_water_handwash = (kp09_b == 1) if kp09_b != .

g toilet_soap = (kp09_c == 1) if kp09_c != .

g mnh_staff_always_present = (kp11 == 1) if kp11 != .

***Measures of facility quality*** 

//capacity_a is the number of indicators equal to 1; it is missing only when all twelve indicators are missing
local quality_items room_for_deliveries delivery_room_privacy delivery_room_bed_clean delivery_room_floor_clean ///
delivery_room_ventilated delivery_room_waste delivery_room_dust placenta_bin toilet_water_flush toilet_water_handwash ///
toilet_soap mnh_staff_always_present
egen capacity_a = rowtotal(`quality_items'), missing

//Keep only the identifiers and the quality score
keep iddesa fascode capacity_a

save "Baseline_facility_quality.dta", replace

********************************************************************************

clear

